# Copyright 2020 The SQLFlow Authors. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import sys
import time

from spellchecker import SpellChecker


def spell_check():
    """Spell-check the text on standard input and print suggested fixes.

    Every line of stdin is tokenised into runs of word characters and
    apostrophes.  A token is examined only when its lowercase form is not
    already in the module-level ``known`` set, it does not start with an
    apostrophe or a digit, it is longer than four characters, and it is
    pure ASCII.  Examined tokens are added (lowercased) to ``known`` so
    they are not examined again on later lines.

    Each examined token is split into parts at underscores and at
    camelCase / acronym boundaries; the parts are handed to the spell
    checker, and for every part it reports as unknown a line of the form
    ``# <line number> <token> \\t s/<part>/<correction>`` is printed.
    """
    checker = SpellChecker()
    checker.word_frequency.load_words(known)

    token_pattern = re.compile(r"[\w']+")
    # Split points: lower/digit -> upper, an underscore, or the last capital
    # of an acronym that is followed by a capitalised word.
    part_pattern = re.compile(
        r"(?<=[a-z0-9])(?=[A-Z])|_|(?<=[A-Z])(?=[A-Z][a-z])")

    for num, line in enumerate(sys.stdin.readlines(), start=1):
        candidates = []
        for token in token_pattern.findall(line):
            if token.lower() in known:
                continue
            if token[0] in "'0123456789":
                continue
            if len(token) <= 4:
                continue
            if any(ord(ch) >= 128 for ch in token):
                continue
            candidates.append(token)

        # Whitelist only after the whole line has been filtered, so a token
        # repeated within one line is still reported once per occurrence.
        for token in candidates:
            known.add(token.lower())

        # Progress indicator; the carriage return lets the next print
        # overwrite it.
        print('#', num, end='\r')
        # NOTE(review): short pause per line, presumably so the progress
        # counter stays readable -- confirm before removing.
        time.sleep(0.002)

        for identifier in candidates:
            parts = part_pattern.split(identifier)
            for word in checker.unknown(parts):
                # Report the checker's single most likely correction.
                suggestion = "s/%s/%s" % (word, checker.correction(word))
                print('#', num, identifier, '\t', suggestion)


# Lowercase words that the spell checker must accept as correct: language
# keywords, library and tool names, user names, abbreviations and a few
# deliberate misspellings that appear in the code base.
#
# Keep one entry per line and end EVERY line with a comma: Python silently
# concatenates adjacent string literals, so a missing comma merges two
# entries into a single bogus word and drops both real ones from the set.
known = {
    "",
    "''",
    "_",
    "accessid",
    "accesskey",
    "adagrad",
    "addr",
    "alibaba",
    "alibabacloud",
    "alipay",
    "alisa",
    "aliyun",
    "analytics",
    "antfin",
    "antxgboost",
    "argmax",
    "argoproj",
    "argparse",
    "args",
    "asaskevich",
    "asynchronously",
    "atexit",
    "atoi",
    "attr",
    "attrs",
    "backends",
    "base64",
    "batchsize",
    "bigint",
    "bigquery",
    "bigqueryml",
    "blog",
    "bool",
    "break",
    "bucketized",
    "bufio",
    "builtin",
    "cannot",
    "case",
    "cfg",
    "chan",
    "classifer",
    "cmd",
    "codegen",
    "coinflip",
    "col1",
    "col2",
    "colspan",
    "combiner",
    "completer",
    "concat",
    "cond",
    "conf",
    "connectable",
    "const",
    "continue",
    "couler",
    "creditcardfraud",
    "crypto",
    "css",
    "csv",
    "cwd",
    "datasource",
    "dataworks",
    "datetime",
    "dbapi",
    "dbmses",
    "debug",
    "debugf",
    "deepcopy",
    "default",
    "defer",
    "delim",
    "desc",
    "dict",
    "dir",
    "dnn",
    "dockerfile",
    "dockerfiles",
    "docstring",
    "doesn't",
    "downloads",
    "dsn",
    "dtype",
    "elasticdl",
    "elasticdlpredict",
    "elasticdltrain",
    "elem",
    "elif",
    "else",
    "emacs",
    "emb",
    "end2",
    "enum",
    "env",
    "environ",
    "envs",
    "eof",
    "errorf",
    "estimator",
    "etype",
    "eval",
    "explainer",
    "expr",
    "extensibility",
    "failf",
    "fallthrough",
    "fatalf",
    "fea1",
    "fea2",
    "filepath",
    "fixme",
    "fld",
    "flds",
    "float32",
    "float64",
    "fmt",
    "for",
    "fprintf",
    "func",
    "getenv",
    "getpid",
    "github",
    "gitlab",
    "go",
    "godoc",
    "gohive",
    "golang",
    "gomaxcompute",
    "google",
    "googleapis",
    "goreportcard",
    "goroutine",
    "goroutines",
    "goto",
    "govalidator",
    "goyacc",
    "grads",
    "gz",
    "gzipped",
    "hacky",
    "hadoop",
    "hangzhou",
    "hdfs",
    "hiveql",
    "http",
    "https",
    "hyperparameter",
    "hyperparameters",
    "ident",
    "idx",
    "if",
    "igroup",
    "img",
    "import",
    "importlib",
    "init",
    "initialize",
    "initializer",
    "int",
    "int16",
    "int32",
    "int64",
    "int8",
    "intellij",
    "interface",
    "ints",
    "ioutil",
    "ishape",
    "isinstance",
    "it's",
    "it2check",
    "iter",
    "iterator",
    "joyyoj",
    "json",
    "jupyter",
    "k8s",
    "kaggle",
    "keras",
    "kmeans",
    "kubectl",
    "kubemaker",
    "kubernetes",
    "kwargs",
    "lexer",
    "lexers",
    "lexing",
    "lhw362950217",
    "linkedin",
    "linux",
    "localhost",
    "logits",
    "logview",
    "lookahead",
    "lstm",
    "map",
    "markdown",
    "matplotlib",
    "mattn",
    "maven",
    "maxcompute",
    "metadata",
    "minibatch",
    "minibatches",
    "mkdir",
    "mutex",
    "mysql",
    "mysqldb",
    "mytable",
    "namenode",
    "namespace",
    "namespaces",
    "nowf",
    "numpy",
    "odps",
    "odpscmd",
    "olekukonko",
    "oneof",
    "openssl",
    "optimizer",
    "oss",
    "ossid",
    "osxfs",
    "overfitting",
    "package",
    "panicf",
    "param",
    "params",
    "passwd",
    "paypal",
    "pbtxt",
    "pkg",
    "plt",
    "plugin",
    "pred",
    "prefetch",
    "premade",
    "preprocessing",
    "pretrain",
    "prev",
    "printf",
    "println",
    "proto3",
    "protobuf",
    "protoc",
    "py",
    "py2",
    "py3",
    "pyodps",
    "pypi",
    "pyplot",
    "python's",
    "python3",
    "range",
    "readline",
    "readthedocs",
    "reentrant",
    "refman",
    "regex",
    "regexp",
    "regressor",
    "repl",
    "resp",
    "restful",
    "return",
    "rmse",
    "rmsprop",
    "select",
    "sepal",
    "serialize",
    "serialized",
    "setenv",
    "sgd",
    "shendiaomo",
    "sixel",
    "skipf",
    "slct",
    "softmax",
    "splitted",
    "sprintf",
    "sql",
    "sqlflow",
    "sqlflowserver",
    "sqlfs",
    "sqlite",
    "sqlite's",
    "sqlite3",
    "squarederror",
    "stackoverflow",
    "staticmethod",
    "stddev",
    "stderr",
    "stdin",
    "stdout",
    "stmt",
    "str",
    "strconv",
    "stretchr",
    "struct",
    "structs",
    "submitters",
    "subprocess",
    "substring",
    "substrings",
    "svail",
    "switch",
    "sym",
    "syscall",
    "tablewriter",
    "tarball",
    "tarfile",
    "tensorboard",
    "tensorflow",
    "testdata",
    "textfile",
    "tf",
    "thenewstack",
    "timelines",
    "timeout",
    "timeouts",
    "timestamp",
    "todo",
    "tolist",
    "tonyyang",
    "tpp",
    "txt",
    "typ",
    "type",
    "typhoonzero",
    "uint",
    "uint16",
    "uint32",
    "uint64",
    "uint8",
    "unary",
    "unicode",
    "unittest",
    "universalize",
    "unmarshal",
    "untar",
    "upload",
    "url",
    "usercenter",
    "utf",
    "utf8",
    "util",
    "utils",
    "uuleon",
    "var",
    "varbinary",
    "varchar",
    "variadic",
    "varint",
    "walkthrough",
    "wangkuiyi",
    "warnln",
    "wechat",
    "weiguo",
    "weiguoz",
    "what's",
    "whitespace",
    "whitespaces",
    "wikipedia",
    "workflows",
    "wronly",
    "www",
    "xflow",
    "xg",
    "xgb",
    "xgboost",
    "yacc",
    "yancey1989",
    "youtube",
}

if __name__ == "__main__":
    # Script entry point: check stdin unless "-h" appears anywhere on the
    # command line, in which case only the usage line is shown.
    if "-h" not in sys.argv:
        spell_check()
    else:
        print(f"usage: python {sys.argv[0]} < file")
